# NOTE(review): removes every object from the calling environment before the
# analysis starts, so running this script in a shared interactive session
# discards whatever was loaded there. Attached packages and options are not
# affected by this call.
rm(list=ls())
library(survival)
library(survminer)
library(preprocessCore)
library(plyr)
library(ggpubr)

# Ask GDC for every known project id and keep the ones whose id contains
# "TCGA"; the remaining ids are dropped.
all_project_ids <- TCGAbiolinks:::getGDCprojects()$project_id
projects <- all_project_ids[grepl("TCGA", all_project_ids)]

# Base directory of the input CSV files (note the trailing slash).
path <- "Z:/Bioinformatics/ExternalDatabases/TCGAbiolinksAnalysis/UnnormalizedData/"
# Gene whose expression is extracted from every project's "<project>Data.csv".
Gene <- c("LAIR1")

# Build `stor`: one row per expression-table column (sample), one column per
# selected gene, plus `proj` holding the project id. The per-project frames are
# produced first and stacked once instead of growing `stor` inside a loop.
#
# NOTE(review): the original loop also read "<project>Metadata.csv" and derived
# the barcodes of "Primary solid Tumor" samples, but never used them, so no
# sample filtering has ever been applied here. That dead code (and the extra
# file read) is removed; if tumor-only filtering was intended it still needs to
# be implemented deliberately.
stor <- do.call(rbind, lapply(projects, function(project) {
  # `path` already ends with "/", so the file name is appended directly.
  dat <- read.csv(paste0(path, project, "Data.csv"))

  # Column `X` carries the row identifier; the text before the first "|" is
  # what gets compared against `Gene`.
  symbols <- vapply(
    strsplit(as.character(dat$X), "|", fixed = TRUE),
    function(parts) parts[1],
    character(1)
  )
  selected <- which(symbols %in% Gene)

  expr <- dat[selected, ]
  expr$X <- NULL

  # Transpose so that samples become rows and the selected genes columns.
  expr_matrix <- t(expr)
  colnames(expr_matrix) <- symbols[selected]

  tumorDat <- data.frame(expr_matrix)
  tumorDat$proj <- project
  tumorDat
}))


# Numeric copy of `stor`. Each expression column is converted on its own:
# calling apply() on the whole data frame would first turn it into a character
# matrix (numbers get formatted as text and `proj` is coerced to NA with a
# warning) and collapses to a plain vector when there is only one row.
expression_columns <- setdiff(colnames(stor), "proj")
numericDat <- data.frame(
  lapply(stor[expression_columns], function(x) as.numeric(as.character(x)))
)

# Re-attach the project label and keep the sample identifiers, which live in
# the row names of `stor`.
numericDat$proj <- stor$proj
numericDat$patIds <- row.names(stor)

# Collagen gene symbols extracted from every project's "<project>Data.csv".
Gene <- c("COL1A1","COL1A2","COL2A1","COL3A1","COL4A1","COL4A2","COL4A3","COL4A4","COL4A5","COL4A6","COL5A1","COL5A2","COL5A3","COL6A1","COL6A2","COL6A3","COL6A5","COL7A1","COL8A1","COL8A2","COL9A1","COL9A2","COL9A3","COL10A1","COL11A1","COL11A2","COL12A1","COL13A1","COL14A1","COL15A1","COL16A1","COL17A1","COL18A1","COL19A1","COL20A1","COL21A1","COL22A1","COL23A1","COL24A1","COL25A1","EMID2","COL27A1","COL28A1","COL29A1")

# Build `stor1`: one row per expression-table column (sample), one column per
# collagen gene found in the table, plus `proj` holding the project id. The
# per-project frames are produced first and stacked once instead of growing
# `stor1` inside a loop.
#
# NOTE(review): the original loop also read "<project>Metadata.csv" and derived
# the barcodes of "Primary solid Tumor" samples, but never used them, so no
# sample filtering has ever been applied here. That dead code (and the extra
# file read) is removed; if tumor-only filtering was intended it still needs to
# be implemented deliberately.
stor1 <- do.call(rbind, lapply(projects, function(project) {
  # `path` already ends with "/", so the file name is appended directly.
  dat <- read.csv(paste0(path, project, "Data.csv"))

  # Column `X` carries the row identifier; the text before the first "|" is
  # what gets compared against `Gene`.
  symbols <- vapply(
    strsplit(as.character(dat$X), "|", fixed = TRUE),
    function(parts) parts[1],
    character(1)
  )
  selected <- which(symbols %in% Gene)

  expr <- dat[selected, ]
  expr$X <- NULL

  # Transpose so that samples become rows and the selected genes columns.
  expr_matrix <- t(expr)
  colnames(expr_matrix) <- symbols[selected]

  tumorDat <- data.frame(expr_matrix)
  tumorDat$proj <- project
  tumorDat
}))


# Per-sample collagen score: the mean over all collagen gene columns of
# `stor1`, ignoring missing values.
collagen_columns <- setdiff(colnames(stor1), "proj")
collagen_values <- data.frame(
  lapply(stor1[collagen_columns], function(x) as.numeric(as.character(x)))
)
numericDat1 <- data.frame(
  Collagen = rowMeans(as.matrix(collagen_values), na.rm = TRUE)
)
numericDat1$proj <- stor1$proj
# Sample identifiers are the row names of `stor1`, exactly as `patIds` was
# taken from the row names of `stor` for the LAIR1 table.
numericDat1$patIds <- row.names(stor1)

# Attach the collagen score to the matching sample. The previous join keyed on
# `proj` alone; with plyr's default match = "all" that paired every sample
# with every collagen value of the same project (rows multiplied per project)
# instead of giving each sample its own score.
numericDat <- join(numericDat, numericDat1, by = c("proj", "patIds"))


#interestingProjects = c("TCGA-COAD","TCGA-HNSC","TCGA-GBM","TCGA-LGG","TCGA-SKCM","TCGA-LUAD","TCGA-LUSC","TCGA-PAAD","TCGA-STAD","TCGA-OV")
#ind = which(projects%in%interestingProjects)
#projects = projects[ind]



  # Derive a dash-separated identifier from the first three "."-separated
  # fields of each sample id; it is the key used to join clinical records.
  id_fields <- strsplit(as.vector(numericDat$patIds), ".", fixed = TRUE)
  idsMod <- unlist(lapply(id_fields, function(fields) {
    paste(fields[1], fields[2], fields[3], sep = "-")
  }))
  numericDat$modPatIds <- idsMod
  
  
  
  # Per-project survival analysis.
  #
  # For every project the samples are split at the project medians of LAIR1 and
  # of the collagen score. Samples at/above both medians form "HighExpression",
  # samples below both form "LowExpression"; samples with mixed status are
  # dropped. A Cox model and a Kaplan-Meier plot are produced whenever both
  # groups are non-empty.
  #
  # Outputs consumed after the loop:
  #   plts  - list of ggplot Kaplan-Meier panels (filled from position 1)
  #   labss - one "Collagen+LAIR1_<project without TCGA->" label per panel
  #   survD - character matrix, one row per fitted project:
  #           project, HR of High vs Low, Wald p-value, n(high), n(low)
  plts <- vector("list", length(projects)*(ncol(numericDat)-1))
  counter <- 1
  labss <- c()
  number_low <- c()
  number_high <- c()
  survD <- c()

  # Collapse all clinical columns whose name matches `pattern` into one number
  # per patient using `pick` (min or max). Patients with no value in any of the
  # matching columns get NA. Works for zero matching columns and zero rows.
  collapse_days <- function(clinical, pattern, pick) {
    day_columns <- grep(pattern, colnames(clinical))
    days <- as.matrix(clinical[, day_columns, drop = FALSE])
    vapply(seq_len(nrow(days)), function(i) {
      values <- as.numeric(days[i, ])
      if (all(is.na(values))) NA_real_ else pick(values, na.rm = TRUE)
    }, numeric(1))
  }

  for (project in projects) {
    AvgCollagenExpression_now <- numericDat[which(numericDat$proj == project), ]

    # ---- clinical data ------------------------------------------------------
    clinicalData <- read.csv(paste0(path, project, "Clinical.csv"))

    all_clin <- data.frame(
      # earliest new tumor event
      new_tumor_days = collapse_days(
        clinicalData, 'days_to_new_tumor_event_after_initial_treatment', min
      ),
      # latest recorded value for death and for last follow-up
      death_days = collapse_days(clinicalData, 'days_to_death', max),
      followUp_days = collapse_days(clinicalData, 'days_to_last_follow_up', max)
    )

    # Time to new tumor / death, falling back to the last follow-up when the
    # event time is missing (those patients are censored at follow-up).
    all_clin$new_time <- ifelse(
      is.na(all_clin$new_tumor_days), all_clin$followUp_days, all_clin$new_tumor_days
    )
    all_clin$new_death <- ifelse(
      is.na(all_clin$death_days), all_clin$followUp_days, all_clin$death_days
    )

    # Event indicator: 0 for 'Alive', 1 for every other non-missing status.
    all_clin$death_event <- ifelse(clinicalData$vital_status == 'Alive', 0, 1)

    # Upper-cased patient barcodes become row names (this fails on duplicated
    # barcodes) and the join key.
    rownames(all_clin) <- toupper(clinicalData$bcr_patient_barcode)
    all_clin$Age <- clinicalData$age_at_index
    all_clin$modPatIds <- row.names(all_clin)

    AvgCollagenExpression_now <- join(AvgCollagenExpression_now, all_clin, by = "modPatIds")

    # Samples without any survival time cannot enter the analysis.
    AvgCollagenExpression_now <-
      AvgCollagenExpression_now[!is.na(AvgCollagenExpression_now$new_death), ]

    # ---- expression groups --------------------------------------------------
    # Third element of quantile() is the 50% quantile (median).
    lair1_median <- quantile(AvgCollagenExpression_now$LAIR1)[3]
    collagen_median <- quantile(AvgCollagenExpression_now$Collagen)[3]

    both_high <- which(
      (AvgCollagenExpression_now$LAIR1 >= lair1_median) &
        (AvgCollagenExpression_now$Collagen >= collagen_median)
    )
    both_low <- which(
      (AvgCollagenExpression_now$LAIR1 < lair1_median) &
        (AvgCollagenExpression_now$Collagen < collagen_median)
    )

    # Keep concordant samples only: high rows first, then low rows (the two
    # index sets cannot overlap).
    AvgCollagenExpression_now <- AvgCollagenExpression_now[c(both_high, both_low), ]
    AvgCollagenExpression_now$event_rna <- rep(
      c("HighExpression", "LowExpression"),
      times = c(length(both_high), length(both_low))
    )

    # A comparison needs both groups.
    if (length(both_high) > 0 && length(both_low) > 0) {
      # Use LowExpression as the reference level so that exp(coef) is the
      # hazard ratio of the High group, which is what the "HR(HIGH)" column of
      # `survD` claims to hold. With the default alphabetical levels the
      # reported value was the ratio for Low vs High. The Wald p-value is the
      # same either way.
      AvgCollagenExpression_now$risk_group <- factor(
        AvgCollagenExpression_now$event_rna,
        levels = c("LowExpression", "HighExpression")
      )
      cox.ph <- coxph(
        Surv(new_death / 30, death_event) ~ risk_group,
        data = AvgCollagenExpression_now
      )
      coeffs <- coef(summary(cox.ph))
      hazard_ratio_high <- unname(coeffs[1, "exp(coef)"])
      hazard_p_value <- unname(coeffs[1, "Pr(>|z|)"])

      # Column order must match the names assigned after the loop:
      # project, HR(HIGH), pr(HR), n(high), n(low). The two counts used to be
      # written in swapped order.
      survD <- rbind(
        survD,
        c(project, hazard_ratio_high, hazard_p_value,
          length(both_high), length(both_low))
      )

      # Kaplan-Meier curves, time expressed in months (days / 30). The
      # character grouping keeps the alphabetical strata order, so the palette
      # still maps red to HighExpression and black to LowExpression.
      fit <- survfit(
        Surv(new_death / 30, death_event) ~ event_rna,
        data = AvgCollagenExpression_now
      )
      gp <- ggsurvplot(
        fit,
        data = AvgCollagenExpression_now,
        pval = FALSE,
        pval.size = 5,
        font.tickslab = c(10, "plain", "black"),
        palette = c("red", "black"),
        legend = "none"
      )
      gp$plot <- gp$plot +
        theme(
          axis.title.x = element_blank(),
          axis.title.y = element_blank(),
          legend.position = "none"
        )

      # Store the ggplot panel itself. The previous `plts[counter] = gp` only
      # worked because R truncated the multi-element ggsurvplot list to its
      # first element (with a warning).
      plts[[counter]] <- gp$plot
      counter <- counter + 1

      labss <- c(labss, paste("Collagen+LAIR1", gsub("TCGA-", "", project), sep = "_"))
    }
  }

  # Name the columns of the Cox summary matrix. `survD` is still NULL when no
  # project produced a fit, and assigning colnames to NULL is an error.
  if (!is.null(survD)) {
    colnames(survD) <- c("project","HR(HIGH)","pr(HR)","n(high)","n(low)")
  }

  # Keep only the list slots that were actually filled, one per label.
  # seq_along() yields an empty selection when there are no labels, whereas
  # 1:length(labss) would index with c(1, 0).
  plts1 <- plts[seq_along(labss)]

  # Each label has the form "<prefix>_<suffix>"; keep the second "_"-separated
  # field of every label.
  labss1 <- vapply(
    strsplit(as.character(labss), "_", fixed = TRUE),
    function(parts) parts[2],
    character(1)
  )

#write.csv(survD,"U:/NC410Manuscript/Results/revision/SurvivalStatsCollagenLAIR1ComboNew.csv")

# Restrict the figure to the panels whose label suffix is in this list.
selected_panels <- which(labss1%in%c("LGG","PAAD","ESCA","LAML","KIRC","GBM","UVM","LIHC","BRCA","LUSC"))

plts1 <- plts1[selected_panels]
labss1 <- labss1[selected_panels]

#figure <- ggarrange(plotlist=plts1,ncol = 5, nrow =2,labels=labss1,font.label = list(size = 20, color = "black"),label.y=1,label.x = 0.5)
# Arrange the selected Kaplan-Meier panels on a 5 x 2 grid.
figure <- ggarrange(plotlist = plts1, ncol = 5, nrow = 2)

# Write the annotated grid to a PNG file.
# - print() is called explicitly: a bare plot expression is only drawn by
#   top-level auto-printing, which does not happen when the script is run via
#   source() with default arguments, leaving an empty image.
# - dev.off() runs in `finally` so the device is closed (and the file
#   released) even when rendering fails.
png("U:/NC410Manuscript/Results/revision/SurvCollagenLAIR_Selected.png", width = 14, height = 7, units = 'in', res = 300)
tryCatch(
  print(
    annotate_figure(
      figure,
      bottom = text_grob("Time in Months", color = "black", size = 30),
      left = text_grob("Survival Probability", color = "black", rot = 90, size = 30)
    )
  ),
  finally = dev.off()
)


#figure <- ggarrange(plotlist=plts1,ncol = 6, nrow =6,labels=labss1,font.label = list(size = 10, color = "black"),label.y=0.5,label.x = 0.5)

#png("U:/NC410Manuscript/Results/revision/SurvCollagenLAIR1.png", width = 14, height = 10, units = 'in', res = 300)
#annotate_figure(figure,
#                bottom = text_grob("Time in Months", color = "black",size = 30),
#                left = text_grob("Survival Probability", color = "black", rot = 90,size=30)
#)
#dev.off()






